##Exploratory Data Analysis- Section 4

###Analytical Graphics Principle 1: show comparisons, and don’t forget your control Principle 2: Show causality, mechanism, explanation for differences Principle 3: show multi-variate data Principle 4: Show what you need, and don’t be limited by the tools Principle 5: describe and document your evidence (code, labels, etc…) Principle 6: content is king- the story you tell is most important

##Exploratory Graphs Why? Look at trends, debug an analysis, communicate results, suggest modelling strategies. - made quickly, many are made at once, axes are cleaned later, and colors will change.

## par() changes the global graphics parameters for all graphics. Requires a reset by using the code below: 
old.par <- par(mfrow = c(1,1), mfcol = c(1,1), mar = c(5, 4, 4, 2))
par(old.par) ##This code allows you to reset the parameters of a plot in R. 

## The list below are all par() commands. 
# ls() is the orientation of axis labels on a plot
# bg() is the background color
# mar() defines the margin size
# oma() is the outer margin size
# mfrow() is numbers of plots per row, column, filled row-wise. 
# mfcol() is numbers of plots per row. column, filled column-wise. 

boxplot(mtcars$mpg, col= "blue") ##example of a boxplot
abline(h = max(mtcars$mpg), col= "orange", lwd = 2)  ##h here is horizontal line addition
abline(h = mean(mtcars$mpg), col ="green", lwd = 2)  ## lwd is the line width/thickness

##_______________________
boxplot(mpg ~ cyl, data = mtcars, col = "orange")  ##plotting the relationship " ~ " between to variables, where mpy is y axis, and cyl is x axis. 
abline(h = mean(mtcars$mpg), col = "blue", lwd = 2)

##_______________________
mtcars2 <- transform(mtcars, cyl= factor(cyl))
boxplot(disp ~ cyl, data = mtcars2, xlab = "Cylinders", ylab = "Displacement") ##labeling axes in base R. 

##_______________________
hist(mtcars$mpg, col= "orange", breaks = 20, main = "Miles per Gallon accross cars") ##example of a histogram
rug(mtcars$mpg)
abline(v = mean(mtcars$mpg), col ="purple", lwd = 3)  ##v stands for vertical line. 
abline(v = max(mtcars$mpg), col = "yellow", lwd = 2)

##_______________________
barplot(table(mtcars$gear), col= "turquoise", main = "Number of Gears per Car")

##_______________________
mtcars <- na.omit(mtcars)
with(mtcars, plot(mpg, disp, col=mpg))
with(subset(mtcars, disp > 400), points(mpg, disp, col = "black")) ##xy scatter plot comparing two column variables. we are taking a subset of the mtcars where displacement is greater than 400, and making those points black.  

##_______________________
##Adding more details to standard plot, e.g. using the legend expression, and adding regression. 
with(mtcars, plot(mpg, disp, main = "Association Between MPG and Displacement", type = "n"))
with(subset(mtcars, cyl == 4), points(mpg, disp, col = "green"))
with(subset(mtcars, cyl != 4), points(mpg, disp, col = "red"))
legend("topright", pch = 1, col = c("green", "red"), legend = c("4 Cylinders", "More than 4 cylinders")) #pch changes the characteristic of the circle/point in the plot. 
model <- lm(disp ~ mpg, mtcars) ##creating a liner model between two varialbes, mpg and displacement
abline(model, col = "black", lty = 2)

#abline(h = mean(mtcars$weight), col="orange", lty = 2)  ##lty is a dotted line. 
#abline(v = mean(mtcars$mpg), col="yellow", lty=2)

##_______________________
par(mfrow = c(3,1), mar = c(5,4,2,1))  ##more to learn here  ##starting at bottom and moving clockwise is how margins are oriented. For mfrow, it states one row, three column.
mtcars$gear <- as.integer(mtcars$gear)
with(subset(mtcars, gear == 3), plot(mpg, wt, col = wt, main = "Three Gears"))
with(subset(mtcars, gear == 4), plot(mpg, wt, col = wt, main = "Four Gears"))
with(subset(mtcars, gear == 5), plot(mpg, wt, col = wt, main = "Five Gears"))

##_______________________
old.par <- par(mfrow = c(1,1), mfcol = c(1,1), mar = c(5, 4, 4, 2))
par(old.par) ##resetting
par(mfrow = c(1,3), oma = c(0,0,2,0))  ##gives us a little room on the outer margin for a main title/header.
with(airquality, {
  plot(Wind, Ozone, main = "Ozone vs. Wind")
  plot(Solar.R, Ozone, main = "Ozone vs. Solar Radiation")
  plot(Temp, Ozone, main = "Ozone and Temperature")
  mtext("Ozone vs. Weather in NYC", outer = TRUE) ##adds a main title/header to this series of three plots. 
})

##_______________________

##Standard plot systems
old.par <- par(mfrow = c(1,1), mfcol = c(1,1), mar = c(5, 4, 4, 2))
par(old.par) ##This code allows you to reset the parameters of a plot in R. 
with(cars, plot(speed, dist))

##_______________________
##Standard lattice system
#e.g. xyplot and bwplot
library(lattice)

xyplot(mpg ~ disp | gear, data = mtcars, layout = c(4,1)) ##separating x/y plot by gear

##_______________________
##ggplot2 plotting
library(ggplot2)
data(mtcars)
qplot(disp, mpg, data = mtcars)

Exporting/Copying plots to files

Once the plot has been generated… dev.copy(png, file = “filename.png”) dev.off()

Histogram Example, Power Grid

##code for histogram of power data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?")  ##separated by semi colons, and the na's are marked as "?".
data_table$Date_Time <- paste(data_table$Date, data_table$Time)  ##Paste the first two rows together and renames them as Date_Time
      data_table$Global_active_power <- as.numeric(data_table$Global_active_power)  ##changes needed data to numeric. 
data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S")  #defining time string as time. 
         data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ]  ##subsets the data for the two dates in question. 
               head(data_table, n=20) ##quick gut check
hist(data_table$Global_active_power, col= "red", breaks = 14, main = "Global Active Power"
      , xlab = "Global Active Power (kilowatts)")

            dev.copy(png, file = "plot1_R.png")
## quartz_off_screen 
##                 3
            dev.off()
## quartz_off_screen 
##                 2

Plotting example, Line Plot, Power Grid

##code for line plot of power data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?")  ##separated by semi colons, and the na's are marked as "?".
data_table$Date_Time <- paste(data_table$Date, data_table$Time)  ##Paste the first two rows together and renames them as Date_Time
      data_table$Global_active_power <- as.numeric(data_table$Global_active_power)  ##changes needed data to numeric. 
data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S")  #defining time string as time. 
       data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ]  ##subsets the data for the two dates in question. 
          head(data_table, n=20) ##quick gut check
with(data_table, plot(Date_Time, Global_active_power, xlab = "", ylab = "Global Active Power (kilowatts)", type ="l", lwd = 2)) ##defining the plot type as line or "l", and width of 2. 

      dev.copy(png, file = "plot2_R.png")
## quartz_off_screen 
##                 3
          dev.off()
## quartz_off_screen 
##                 2

###Plotting + Lines (multi-column) Example, Power Grid

##code for three-variable plot of metering data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?")  ##separated by semi colons, and the na's are marked as "?".
    data_table$Date_Time <- paste(data_table$Date, data_table$Time)  ##Paste the first two rows together and renames them as Date_Time
data_table$Global_active_power <- as.numeric(data_table$Global_active_power)  ##changes needed data to numeric. 
    data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S")  #defining time string as time. 
        data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ]  ##subsets the data for the two dates in question. 
            head(data_table, n=20) ##quick gut check
with(data_table, plot(Date_Time, Sub_metering_1, col= "black", xlab = "", ylab ="Energy sub metering", type = "l"))
                      lines(data_table$Date_Time, data_table$Sub_metering_2, col="orange", type="l")
                      lines(data_table$Date_Time, data_table$Sub_metering_3, col="blue", type="l")
                          legend("topright", c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"), col = c("black", "orange", "blue"), lty = 1)

                              dev.copy(png, file = "plot3_R.png")
## quartz_off_screen 
##                 3
                               dev.off() 
## quartz_off_screen 
##                 2

###2x2 Plot Example, Power Grid

##code for 2x2 plot of metering data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?")  ##separated by semi colons, and the na's are marked as "?".
    data_table$Date_Time <- paste(data_table$Date, data_table$Time)  ##Paste the first two rows together and renames them as Date_Time
       data_table$Global_active_power <- as.numeric(data_table$Global_active_power)  ##changes needed data to numeric. 
       data_table$Global_reactive_power <- as.numeric(data_table$Global_reactive_power)
       data_table$Voltage <- as.numeric(data_table$Voltage)
data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S")  #defining time string as time. 
      data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ]  ##subsets the data for the two dates in question. 
          head(data_table, n=20) ##quick gut check
par(mfrow = c(2,2))
with(data_table, plot(Date_Time, Global_active_power, xlab = "", ylab = "Global Active Power (kilowatts)", type ="l", lwd = 2))
        with(data_table, plot(Date_Time, Voltage, xlab = "", ylab = "Voltage", type ="l", lwd = 1))
            with(data_table, plot(Date_Time, Sub_metering_1, col= "black", xlab = "", ylab ="Energy sub metering", type = "l"))
            lines(data_table$Date_Time, data_table$Sub_metering_2, col="orange", type="l")
            lines(data_table$Date_Time, data_table$Sub_metering_3, col="blue", type="l")
            legend("topright", c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"), col = c("black", "orange", "blue"))
                  with(data_table, plot(Date_Time, Global_reactive_power, xlab = "", type ="l", lwd = 1))

                      dev.copy(png, file = "plot4_R.png")
## quartz_off_screen 
##                 3
                       dev.off() 
## quartz_off_screen 
##                 2

##Lattice Plotting xyplot( y~ x | f * g, data) is the standard notation for a lattice plot. left of ~ is trhe y-axis, and on the right is the x-axis f and g are conditioning variables where the * indicates an interaction between two variables

library(lattice)
xyplot(mpg ~ disp, data =mtcars) ##simple scatter plot example using mtcars. 

xyplot(Ozone ~ Wind, data = airquality)

##_____________________
airquality <- transform(airquality, Month = factor(Month))  ##Month was turned into a factor that we will ultimately parse the data into. 
xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5,1))  ##xyplot, where month was listed as a conditioning variable. 

##ggplot2 plotting Grammar of graphics by Leland Wilkinson (ggplot2.org) aesthetics: size, shape, color geoms: points, lines ###qplot

library(ggplot2) 
str(mpg) ##example dataset that comes with ggplot2
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...
qplot(displ, hwy, data = mpg)  ##most basic of "base" plots for ggplot2. displ = x-coordinate, hwy = y-coordinate. 

qplot(displ, hwy, data = mpg, color = drv)  ##adds a legend where colors are specific to variable drv (front, rear or 4w)

qplot(displ, hwy, data = mpg, geom = c("point", "smooth"))  ## grey = 95% confidence interval, and "smooth" is simply the function for the line. 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

qplot( hwy, data = mpg, fill = drv)  ##when only one variable is defined, it will produce a histogram. fill is an alternative to color when using a histogpram. 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##Facets_______________
qplot(displ, hwy, data = mpg, facets = .~drv) ## here, the facets function splits "." or everything by ~ the variable drv

qplot(hwy, data= mpg, facets = drv~., binwidth= 2, color = drv)## here, histogram where facet used to separate drv ~ by "." or everything

##Geoms______________________
qplot(hwy, data = mpg, geom="density", color = drv) ##example of geom "density" which is a smooth plot alternative to histogram, can accept y variable only, notx and y

qplot(displ, hwy, data = mpg, color = drv) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

qplot(displ, hwy, data = mpg, facets = .~drv, color = drv) + geom_smooth(method = "lm") ## using facets to separate out the regression by drv. lm for linear model.
## `geom_smooth()` using formula 'y ~ x'

qplot(displ, hwy, data = mpg, facets = .~ cyl, color = cyl, geom = c("point", "smooth"), method = "lm")  ##again, point and smooth gives us the confidence interval, along with a line "lm" for linear model. 
## Warning: Ignoring unknown parameters: method
## `geom_smooth()` using formula 'y ~ x'

##______________________

ggplot

g <- ggplot(mpg, aes(hwy, cyl))  ##aes stands for the aesthetic elements, i.e. what and it's orientation.
summary(g)
## data: manufacturer, model, displ, year, cyl, trans, drv, cty, hwy, fl,
##   class [234x11]
## mapping:  x = ~hwy, y = ~cyl
## faceting: <ggproto object: Class FacetNull, Facet, gg>
##     compute_layout: function
##     draw_back: function
##     draw_front: function
##     draw_labels: function
##     draw_panels: function
##     finish_data: function
##     init_scales: function
##     map_data: function
##     params: list
##     setup_data: function
##     setup_params: function
##     shrink: TRUE
##     train_scales: function
##     vars: function
##     super:  <ggproto object: Class FacetNull, Facet, gg>
##print(g) will return an error because there is  insufficient information to provide an output. 
p <- g + geom_point()  ##point layer added to g
print(p)

 g + geom_point() + geom_smooth()  ##smooths out and adds confidence bands. 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

 g + geom_point() + geom_smooth(method = "lm") ##adds a linear model best-fit line
## `geom_smooth()` using formula 'y ~ x'

 ##___________________
 h <- ggplot(mpg, aes(displ, drv))
 h + geom_point(color = "forestgreen", size = 4, alpha= 1/2) + ##color of this variable is a constant - green
   #facet_grid(. ~ cyl) + 
   geom_smooth(method = "lm") + 
   xlab("Displacement") + ##labels the x axis
  ylab("Drive_Type") +
   ggtitle("Engine Attributes")
## `geom_smooth()` using formula 'y ~ x'

        h <- ggplot(mpg, aes(displ, cty))
        h + geom_point(aes(color = cyl), size = 4, alpha= 1/2) +   ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that. 
          theme_classic(base_family = "Avenir", base_size = 14) + ##changes the base font for the entire plot
           geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") + 
           xlab("Displacement") + ##labels the x axis
            ylab("City_MPG") +
            ggtitle("Engine Attributes") +
            ylim(-5, 30)  ##sets a limit to the y axis, but in ggplot, that will subset the data and remove outlier points. 
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

                ## + coor_cartesian(ylim = c(-5, 30))  This is a way of preventing sub-setting and outliers are included in the finished plot. 
  
        ##facet_wrap( column/category ~ column/category2, nrow = y, ncol = x)

##Heirarchical Clustering [i.e. cluster analysis] - How do we define close? - Continuous = euclidian distance (straight line) a^2 + b^2 = c^2 - Continuous = correlation similarity - Binary = manhattan distance. |A1 - A2| + |B1 - B2| + …. |Z1 - Z2| absolute values - think walking city blocks. - How do we group things? - How to we visualize the groupings? - How do we interpret the grouping?

Taking an agglomerative approach (bottom up), find closes two things, put them into larger clusters, find next closest, etc… - Required a defined distance - A merging approach Produced: a tree showing how close things are to each other.

e.g. dist(mpg) gives you the pairwise differences between each variable in each column.

###Dendrogram - rudimentary

distxy <- dist(mpg$cty)
hClustering <- hclust(distxy)
plot(hClustering)  ##this clustering does not necessarily demonsrate anything significant in terms of output or trends, but will at least produce a figure. 

          #clusterMatrix <- as.matrix(hClustering[ , 8:9])
          #heatmap(clusterMatrix)



set.seed(1234)
par(mar = c(0,0,0,0))
x <- rnorm(12, mean= rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1,2,1), each = 4), sd = 0.2)
plot(x,y, col= "orange", pch= 5, cex = 2)
text(x + 0.08, y + 0.08, labels = as.character(1:12))

      Data_thing <- data.frame( x=x, y=y)
      data_dist <- dist(Data_thing)
      thing_cluster <- hclust(data_dist)
      plot(thing_cluster)

           thing_matrix <- as.matrix(Data_thing) [sample(1:12), ]
           heatmap(thing_matrix)  

##k-means Clustering & Dimension reduction Partitions -fix a number of clusters - get “centroids” of each cluster - assign things to closest centroid - recalculate centroids and iterate Requires - A defined distance metric - a number of clusters - an initial guess as to cluster centroids Produces - Final estimate of cluster and assignments

###kmeans Clustering

##Generating random data here
set.seed(1234)
par(mar = c(0,0,0,0))
x <- rnorm(12, mean= rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1,2,1), each = 4), sd = 0.2)
plot(x,y, col= "orange", pch= 5, cex = 2)
text(x + 0.08, y + 0.08, labels = as.character(1:12))

##kmeans function/argument for clustering analyses
data_thing <- data.frame(x = x, y = y)
kmeansObject <- kmeans(data_thing, centers = 3)  ##caling kmeans argument, three centers/groups
names(kmeansObject)
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
par(mar = rep(0.2, 4))  #adjustes the margins
plot( x, y, col= kmeansObject$cluster, pch = 19, cex = 2)  #plots the kmeans function, calling out the clusters
points(kmeansObject$centers, col = 1:3, pch = 3, lwd = 3)  # plots the points that represent the cluster centers

Principal components analysis and singular decomposition

Not much to report here, lol.

Working with color in R

library(grDevices) ##takes blending of primary and other colors and blend them together. 
# colorRamp  (e.g. gray)
# colorRampPalette  (e.g. heat.colors)
# colors()
color1 <- colorRamp(c("red", "blue"))
color1(0) ##will  return a matrix of three values - Red  Green  Blue in that order, where it's all red.
##      [,1] [,2] [,3]
## [1,]  255    0    0
color1(1) ##will return a matrix that is all blue. 
##      [,1] [,2] [,3]
## [1,]    0    0  255
color1(0.5) # will return a blend of both red and blue, but no green. 
##       [,1] [,2]  [,3]
## [1,] 127.5    0 127.5
    color1(seq(0,1, len = 10))   ## generated a sequence, length of 10, of a sequence between all red to all blue. 
##            [,1] [,2]      [,3]
##  [1,] 255.00000    0   0.00000
##  [2,] 226.66667    0  28.33333
##  [3,] 198.33333    0  56.66667
##  [4,] 170.00000    0  85.00000
##  [5,] 141.66667    0 113.33333
##  [6,] 113.33333    0 141.66667
##  [7,]  85.00000    0 170.00000
##  [8,]  56.66667    0 198.33333
##  [9,]  28.33333    0 226.66667
## [10,]   0.00000    0 255.00000
library(RColorBrewer)
    cols <- brewer.pal(15, "RdYlGn")  ##brewer.pal the only useful argument in this package. colorbrewer.org has the list of available color shchema. 
## Warning in brewer.pal(15, "RdYlGn"): n too large, allowed maximum for palette RdYlGn is 11
## Returning the palette you asked for with that many colors
    pal <-colorRampPalette(cols)  ##rendering the colors into Ramp Pallete
    image(volcano, col= pal(15))  ##asking the colors to be those rendered into pal, and there will be 10 different colors. 

x <- rnorm(1000)
y <- rnorm(1000)
plot(x, y, col = rgb(.1,0,.3,.2), pch = 19)  ##RGB looks at red, green, blue as numeric, and last is the alpha of 0.2

Samsung phone- accelerometer example.

list.files("./data/UCI HAR Dataset/")
## character(0)
datapath <-file.path("/Users/payashome/Documents/FMDtRH/R Studio/R Tutorials/R_4_DataScience/data", "UCI HAR Dataset 2")
    files <- list.files(datapath, recursive = TRUE)  ##lists all files in the UCI folder
    files
##  [1] "activity_labels.txt"                         
##  [2] "features_info.txt"                           
##  [3] "features.txt"                                
##  [4] "README.txt"                                  
##  [5] "test/Inertial Signals/body_acc_x_test.txt"   
##  [6] "test/Inertial Signals/body_acc_y_test.txt"   
##  [7] "test/Inertial Signals/body_acc_z_test.txt"   
##  [8] "test/Inertial Signals/body_gyro_x_test.txt"  
##  [9] "test/Inertial Signals/body_gyro_y_test.txt"  
## [10] "test/Inertial Signals/body_gyro_z_test.txt"  
## [11] "test/Inertial Signals/total_acc_x_test.txt"  
## [12] "test/Inertial Signals/total_acc_y_test.txt"  
## [13] "test/Inertial Signals/total_acc_z_test.txt"  
## [14] "test/subject_test.txt"                       
## [15] "test/X_test.txt"                             
## [16] "test/y_test.txt"                             
## [17] "train/Inertial Signals/body_acc_x_train.txt" 
## [18] "train/Inertial Signals/body_acc_y_train.txt" 
## [19] "train/Inertial Signals/body_acc_z_train.txt" 
## [20] "train/Inertial Signals/body_gyro_x_train.txt"
## [21] "train/Inertial Signals/body_gyro_y_train.txt"
## [22] "train/Inertial Signals/body_gyro_z_train.txt"
## [23] "train/Inertial Signals/total_acc_x_train.txt"
## [24] "train/Inertial Signals/total_acc_y_train.txt"
## [25] "train/Inertial Signals/total_acc_z_train.txt"
## [26] "train/subject_train.txt"                     
## [27] "train/X_train.txt"                           
## [28] "train/y_train.txt"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## We are going to read in the train, test, features and activities as seperate
#setwd("./data/UCI HAR Dataset 2")
x_train <- read.table(file.path(datapath, "train", "X_train.txt"), header = FALSE)  ##here, file.path saves time by allowing the UCI HAR to be accesses as datapath, in place of pasting the complete filepath. 
  y_train <- read.table(file.path(datapath, "train", "Y_train.txt"), header = FALSE) ## reading in the y train data
      train_sub <- read.table(file.path(datapath, "train", "subject_train.txt"), header = FALSE)
##_________________
x_test <- read.table(file.path(datapath, "test", "X_test.txt"), header = FALSE)  ##reading in data for test, same as above. 
  y_test <- read.table(file.path(datapath, "test", "Y_test.txt"), header = FALSE)
      test_sub <- read.table(file.path(datapath, "test", "subject_test.txt"), header = FALSE)
##_________________
features <- read.table(file.path(datapath, "features.txt"), header = FALSE) ##reading in additional files, no files denoted because they're already in datapath.
actLabel <- read.table(file.path(datapath, "activity_labels.txt"), header = FALSE)

colnames(actLabel) <- c('activityID', 'activityTYPE')
colnames(x_train) = features[ ,2] ##defining the column names as a function of features, which has two columns, so the column name for x_train will come from the list of names in column 2 of features. 
  colnames(y_train) = "activityID"
      y_train <- left_join(y_train, actLabel, by = "activityID")  ##Left joining two data.frames using actLabel as the key, where both column ID's have to match. 
      colnames(train_sub) = "subjectID"

colnames(x_test) = features[ ,2] ##column names are same as in train. 
  colnames(y_test) = "activityID" #descriptive column names for the activity type. 
    y_test <- left_join(y_test, actLabel, by ="activityID")
      colnames(test_sub) = "subjectID" #descriptive column name for the individual, i.e. 1 of 30 participants. 
        ##simply giving column names to the actLabel data.frame
      

combine_train <- cbind(y_train, train_sub, x_train)  ##when str(combine_train), we see activityID as first column (y_train), then the participant ID, then the numerous data columns (x_train)
combine_test <- cbind(y_test, test_sub, x_test)
complete_data <- rbind(combine_train, combine_test) ##row binds the two datasets together. 


names(complete_data) <- gsub("^t", "time", names(complete_data))  ##any string (i.e. column name) beginning with a t changed to time. 
names(complete_data) <- gsub("^f", "frequency", names(complete_data))  ## same for frequency
names(complete_data) <- gsub("Acc", "Accelerometer", names(complete_data))
names(complete_data) <- gsub("Gyro", "Gyroscope", names(complete_data))
names(complete_data) <- gsub("Mag", "Magnitude", names(complete_data))
names(complete_data) <- gsub("BodyBody", "body", names(complete_data)) 

write.table(complete_data, file ="tidydata_samsung", row.name=FALSE)
head(complete_data, n=20)
##___________
par(mfrow = c(1,2), mar = c(5,4,2,1))
complete_data <- transform(complete_data, activityTYPE = factor(activityTYPE))
    sub1 <- subset(complete_data, subjectID == 1)
      plot(sub1[, 4], col= sub1$activityID, ylab = names(sub1)[4])
      plot(sub1[, 5], col = sub1$activityID, ylab = names(sub1)[5])
        op <- par(cex = .5)  ##changes the font size for the legend of the plot. 
              legend("bottomright", legend = unique(sub1$activityTYPE), col = unique(sub1$activityTYPE), pch = 1)

par(mfrow = c(1,2), mar = c(5,4,2,1))  ##looking at the max acceleration in x and y for subject 1 
complete_data <- transform(complete_data, activityTYPE = factor(activityTYPE))
    sub1 <- subset(complete_data, subjectID == 1)
      plot(sub1[, 13], col= sub1$activityID, ylab = names(sub1)[13])
      plot(sub1[, 14], col = sub1$activityID, ylab = names(sub1)[14])
        op <- par(cex = .5)  ##changes the font size for the legend of the plot. 
              legend("topright", legend = unique(sub1$activityTYPE), col = unique(sub1$activityTYPE), pch = 19)

EPA Air Quality Example- Fine particulate matter.

http://goo.gl/soQZHM

pm0 <- read.table("./data/PM2-5_2000_2019_annual.txt", header = TRUE, sep= "|", na.strings = "NA", fill = TRUE) ##for a read.table, it expects an element in each column, therefore it is crucial to fill = TRUE in order to fill it in, otherwise it won't be read. 
head(pm0, n= 20)
library(dplyr)
    
    pm0 <- pm0[pm0$year %in% c("2000","2019"), ] ##subsetting the data.frame to include two years only. careful, year is denoted as an integer here. strptime not working well for a conversion. 
      #names(pm0) <- names(pm0) %>% make.names() ##renames column names due to abundance of spaces. Fills them in with "_". 
        head(pm0, n=20)
        ##taking summary view of sample values. 
        x0 <- pm0$arithmetic_mean
        summary(x0)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.915   7.039   8.833   9.895  12.563  28.220
        str(x0)
##  num [1:1309] 11.51 6.77 6.77 10.29 7.92 ...
        mean(is.na(x0))
## [1] 0
        ##subsetting the data into two data.frames, and then boxplotting each on same plot. 
        pm0$year <- as.integer(pm0$year)
            x2000 <- subset(pm0, pm0$year == 2000)
            x2019 <- subset(pm0, pm0$year == 2019)
              boxplot( x2000$arithmetic_mean, x2019$arithmetic_mean, col="blue", pch = 19)

## ______________ Drilling down into the data_______________##            
        negative <- x2000$arithmetic_mean < 0
        str(negative)
##  logi [1:565] FALSE FALSE FALSE FALSE FALSE FALSE ...
        sum(negative, na.rm = TRUE)
## [1] 0
        mean(negative, na.rm = TRUE)
## [1] 0
            hist(x2000$arithmetic_mean, xlab = "PM2.5 (micrograms/cubic meter)")  ##getting a view of the data

            site2000 <- unique(subset(x2000, state_code == 36, c(county_code, si_id))) ##returns two column data.frame unique to the state, and showing county and site id's. 
              head(site2000)
              site2000 <- paste(site2000[,1], site2000[,2], sep = ".")  ##pasting together string/number from col 1 to col 2, returning a "number.number" that is the combination of the two. 
        
        negative2 <- x2019$arithmetic_mean < 0
        str(negative)
##  logi [1:565] FALSE FALSE FALSE FALSE FALSE FALSE ...
        sum(negative, na.rm = TRUE)
## [1] 0
        mean(negative, na.rm = TRUE)
## [1] 0
            hist(x2019$arithmetic_mean, xlab = "PM2.5 (micrograms/cubic meter)")

            site2019 <- unique(subset(x2019, state_code == 36, c(county_code, si_id))) ##returns two column data.frame unique to the state, and showing county and site id's. 
              head(site2019)
              site2019 <- paste(site2019[,1], site2019[,2], sep = ".")
                  both <- intersect(site2019, site2000)  ##here- looks at the county, and site monitors that were present in both 2000 and 2019, and the return shows that there were 7. 
                  str(both)
##  chr [1:7] "085.10983" "067.10901" "005.10494" "031.10632" "001.10461" ...
                  x2000$county_site <- with(x2000, paste(county_code, si_id, sep = ".")) ##creating new columns that are the combo of county and site (from above)
                  x2019$county_site <- with(x2019, paste(county_code, si_id, sep = "."))

                      ct2000 <- subset(x2000, state_code == 36 & county_site %in% both)
                      ct2019 <- subset(x2019, state_code == 36 & county_site %in% both)
                      
                      sapply(split(ct2000, ct2000$county_site), nrow) ## if these were not annual averages, but a collection of dates and n number of observations, the sapply on the split would show us how many observations we had per  station in "both".
## 001.10461 001.10468 005.10494 031.10632 061.10801 067.10901 085.10983 
##         1         1         2         1         1         1         1
                      sapply(split(ct2019, ct2019$county_site), nrow)
## 001.10461 001.10468 005.10494 031.10632 061.10801 067.10901 085.10983 
##         1         1         1         1         1         2         1

EPA Air Quality Example #2- PM2.5

key <- readRDS("Source_Classification_Code.rds")
data0 <- readRDS("summarySCC_PM25.rds")
    head(key, n=20)
    head(data0, n=20)
        str(key)
## 'data.frame':    11717 obs. of  15 variables:
##  $ SCC                : Factor w/ 11717 levels "10100101","10100102",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Data.Category      : Factor w/ 6 levels "Biogenic","Event",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ Short.Name         : Factor w/ 11238 levels "","2,4-D Salts and Esters Prod /Process Vents, 2,4-D Recovery: Filtration",..: 3283 3284 3293 3291 3290 3294 3295 3296 3292 3289 ...
##  $ EI.Sector          : Factor w/ 59 levels "Agriculture - Crops & Livestock Dust",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ Option.Group       : Factor w/ 25 levels "","C/I Kerosene",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Option.Set         : Factor w/ 18 levels "","A","B","B1A",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ SCC.Level.One      : Factor w/ 17 levels "Brick Kilns",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ SCC.Level.Two      : Factor w/ 146 levels "","Agricultural Chemicals Production",..: 32 32 32 32 32 32 32 32 32 32 ...
##  $ SCC.Level.Three    : Factor w/ 1061 levels "","100% Biosolids (e.g., sewage sludge, manure, mixtures of these matls)",..: 88 88 156 156 156 156 156 156 156 156 ...
##  $ SCC.Level.Four     : Factor w/ 6084 levels "","(NH4)2 SO4 Acid Bath System and Evaporator",..: 4455 5583 4466 4458 1341 5246 5584 5983 4461 776 ...
##  $ Map.To             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Last.Inventory.Year: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Created_Date       : Factor w/ 57 levels "","1/27/2000 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Revised_Date       : Factor w/ 44 levels "","1/27/2000 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Usage.Notes        : Factor w/ 21 levels ""," ","includes bleaching towers, washer hoods, filtrate tanks, vacuum pump exhausts",..: 1 1 1 1 1 1 1 1 1 1 ...
        str(data0)
## 'data.frame':    6497651 obs. of  6 variables:
##  $ fips     : chr  "09001" "09001" "09001" "09001" ...
##  $ SCC      : chr  "10100401" "10100404" "10100501" "10200401" ...
##  $ Pollutant: chr  "PM25-PRI" "PM25-PRI" "PM25-PRI" "PM25-PRI" ...
##  $ Emissions: num  15.714 234.178 0.128 2.036 0.388 ...
##  $ type     : chr  "POINT" "POINT" "POINT" "POINT" ...
##  $ year     : int  1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
            summary(data0)
##      fips               SCC             Pollutant           Emissions       
##  Length:6497651     Length:6497651     Length:6497651     Min.   :     0.0  
##  Class :character   Class :character   Class :character   1st Qu.:     0.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :     0.0  
##                                                           Mean   :     3.4  
##                                                           3rd Qu.:     0.1  
##                                                           Max.   :646952.0  
##      type                year     
##  Length:6497651     Min.   :1999  
##  Class :character   1st Qu.:2002  
##  Mode  :character   Median :2005  
##                     Mean   :2004  
##                     3rd Qu.:2008  
##                     Max.   :2008
library(plotrix)  ##allows us to have a break in the 
#with(data0, gap.plot(year, Emissions, gap=c(10,100), col= year, pch = 1, ylim = c(-1, 200), xlab = "Year", ylab = "Emissions", main ="Emissions over time- USA"))
    #legend("topright", pch = 1, col = unique(data0$year), legend = unique(data0$year))
    #model <- lm(Emissions ~ year, data0) ##creating a liner model between two variables
    #abline(model, col = "purple", lty = 2)
        #scipen = 5   ##for turning off scientific notation. 
        #dev.copy(png, file = "plot1.png")
        #dev.off()
        
        
data1 <- subset(data0, fips == "24510")
with(data1, gap.plot(year, Emissions, gap=c(10,100), col= year, pch = 1, ylim = c(-1, 200), xlab = "Year", ylab = "Emissions", main ="Emissions over time- Baltimore City"))
## Warning in gap.plot(year, Emissions, gap = c(10, 100), col = year, pch = 1, :
## some values of y may not be displayed
    legend("topright", pch = 1, col = unique(data1$year), legend = unique(data1$year))
    model <- lm(Emissions ~ year, data1) ##creating a liner model between two variables
    abline(model, col = "purple", lty = 2)

        scipen = 5   ##for turning off scientific notation. 
        dev.copy(png, file = "plot2.png")
## quartz_off_screen 
##                 3
        dev.off()
## quartz_off_screen 
##                 2
 ###________________________        
library(ggplot2)   
data2 <- subset(data0, fips == "24510" & type == "POINT")
data3 <- subset(data0, fips == "24510" & type == "NONPOINT")
data4 <- subset(data0, fips == "24510" & type == "ON-ROAD")
data5 <- subset(data0, fips == "24510" & type == "NON-ROAD")

library(RColorBrewer)
    cols <- brewer.pal(6, "RdYlGn")  ##brewer.pal the only useful argument in this package. colorbrewer.org has the list of available color shchema. 
    pal <-colorRampPalette(cols)  ##rendering the colors into Ramp Pallete
    
  
g1 <- ggplot(data2, aes(year, Emissions)) 
p1 <- g1 + geom_point(aes(color = year), size = 4, alpha= 1/2) +   ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that. 
          theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot        
            scale_fill_brewer(palette = "RdYlBu") +  ##attempt to use brewer color schemes... doesn't seem to work. 
            geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") + 
            xlab("Year") + ##labels the x axis
            ylab("Emissions") +
            ggtitle("Baltimore City Emissions by 'POINT'") +
            ylim(-1, 30)

g2 <- ggplot(data3, aes(year, Emissions)) 
p2 <- g2 + geom_point(aes(color = year), size = 4, alpha= 1/2) +   ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that. 
          theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot        
            scale_fill_brewer(palette = "RdYlBu") +  ##attempt to use brewer color schemes... doesn't seem to work. 
            geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") + 
            xlab("Year") + ##labels the x axis
            ylab("Emissions") +
            ggtitle("Baltimore City Emissions by 'NONPOINT'") +
            ylim(-1, 30)

g3 <- ggplot(data4, aes(year, Emissions)) 
p3 <- g3 + geom_point(aes(color = year), size = 4, alpha= 1/2) +   ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that. 
          theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot        
            scale_fill_brewer(palette = "RdYlBu") +  ##attempt to use brewer color schemes... doesn't seem to work. 
            geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") + 
            xlab("Year") + ##labels the x axis
            ylab("Emissions") +
            ggtitle("Baltimore City Emissions by 'ONROAD'") +
            ylim(-1, 30)

g4 <- ggplot(data5, aes(factor(year), Emissions)) 
p4 <- g4 + geom_point(aes(color = year), size = 4, alpha= 1/2) +   ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that. 
          theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot        
            scale_fill_brewer(palette = "RdYlBu") +  ##attempt to use brewer color schemes... doesn't seem to work. 
            geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm", na.rm= TRUE) + 
            xlab("Year") + ##labels the x axis
            ylab("Emissions") +
            ggtitle("Baltimore City Emissions by 'NONROAD'") +
            ylim(-1, 30)
library(cowplot)

plot_grid(p1, p2, p3, p4, labels= "", ncol=2, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 19 rows containing non-finite values (stat_smooth).
## Warning: Removed 19 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## Warning: Removed 35 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing missing values (geom_point).

dev.copy(png, file = "plot3.png")
## quartz_off_screen 
##                 3
        dev.off()
## quartz_off_screen 
##                 2
#####___________ Alternative ggplot
data1 <- subset(data0, fips == "24510") ##subset for baltimore
agdata1 <- aggregate(Emissions ~ year, data1, sum)  ##not taht important for this plot.
gg1 <- ggplot(data1, aes(factor(year), Emissions, fill=type)) ##type is a categorical variable/column header. 
gp1 <- gg1 + geom_bar(stat="identity") + ##identity leave the data unchanged. The default is to count
      theme_bw() +
      guides(fill = "none") + ##gets rid of the legend on the right side. 
      facet_grid(.~type, scales = "free", space = "free")+  ##grid is organized by "." or everything, by type(non-road, etc...). Separates 4, 4-olored columns into four grids, each with 4 colors. 
      xlab("Year") + ##labels the x axis
      ylab("Emissions")+
      ggtitle("PM2.5 Emissions, Baltimore City")

gp1

        library(plotrix)  ##allows us to have a break in the
        library(data.table)
        library(dplyr)
      data6 <- left_join(data0, key, by = "SCC")
data7 <- data6[data6$SCC.Level.Three %like% "coal", ]
with(data7, gap.plot(year, Emissions, gap=c(100,200), col= year, pch = 1, ylim = c(-1, 300), xlab = "Year", ylab = "Emissions", main ="Coal Use over time- USA"))
## Warning in gap.plot(year, Emissions, gap = c(100, 200), col = year, pch = 1, :
## some values of y may not be displayed
    legend("topright", pch = 1, col = unique(data6$year), legend = unique(data6$year))
    model <- lm(Emissions ~ year, data7) ##creating a liner model between two variables
    abline(model, col = "purple", lty = 2)

        scipen = 5   ##for turning off scientific notation. 
        dev.copy(png, file = "plot4.png")
## quartz_off_screen 
##                 3
        dev.off()
## quartz_off_screen 
##                 2
        library(plotrix)  ##allows us to have a break in the
        library(data.table)
        library(dplyr)
        par(mfrow = c(1,1))
data6 <- left_join(data0, key, by = "SCC")
  data8 <- subset(data6, fips == "24510" & SCC.Level.One == "Internal Combustion Engines")
      with(data8, plot(year, Emissions, col= year, pch = 1, cex = 3, xlab = "Year", ylab = "Emissions", main ="Combusion Engine Part. over time- Maryland City"))  ##cex changes the size of the points. 
          legend("topright", pch = 1, col = unique(data8$year), legend = unique(data8$year))
          model <- lm(Emissions ~ year, data8) ##creating a liner model between two variables
              abline(model, col = "purple", lty = 2)

                scipen = 5   ##for turning off scientific notation. 
                dev.copy(png, file = "plot5.png")
## quartz_off_screen 
##                 3
                dev.off()
## quartz_off_screen 
##                 2
          par(mfrow = c(2,1))
                data6 <- left_join(data0, key, by = "SCC")
  data9 <- subset(data6, fips == ("24510") & SCC.Level.One == "Internal Combustion Engines")
      with(data9, plot(year, Emissions, col= fips, pch = 1, xlab = "Year", ylab = "Emissions", main ="Combusion Engine Part. Maryland City"))
          legend("topright", pch = 1, col = unique(data9$fips), legend = unique(data9$fips))
          model <- lm(Emissions ~ year, data9) ##creating a liner model between two variables
          abline(model, col = "purple", lty = 2)
                scipen = 5
 
   data10 <- subset(data6, fips == ("06037") & SCC.Level.One == "Internal Combustion Engines")
      with(data10, plot(year, Emissions, col= fips, pch = 1, xlab = "Year", ylab = "Emissions", main ="Combusion Engine Part. LA County"))
          legend("topright", pch = 1, col = unique(data10$fips), legend = unique(data10$fips))
          model <- lm(Emissions ~ year, data10)
              abline(model, col = "orange", lty = 2)

                scipen = 5   ##for turning off scientific notation. 
                
                dev.copy(png, file = "plot6.png")
## quartz_off_screen 
##                 3
                dev.off()
## quartz_off_screen 
##                 2

Steps/Intervals Example

Reading in the data.frame

data0 <- na.omit(read.csv("./data/activity.csv")) ##reading in data
data0$date <- as.Date(data0$date)
str(data0)
## 'data.frame':    15264 obs. of  3 variables:
##  $ steps   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ date    : Date, format: "2012-10-02" "2012-10-02" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2304] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:2304] "1" "2" "3" "4" ...
head(data0, n=20)

Total Steps per day

s <- split(data0, data0$date)  
    sapply(s, function(x) colSums(x[, c("steps", "interval")], na.rm = TRUE)) ##calculates total steps for eacy day. 
##          2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 2012-10-07
## steps           126      11352      12116      13294      15420      11015
## interval     339120     339120     339120     339120     339120     339120
##          2012-10-09 2012-10-10 2012-10-11 2012-10-12 2012-10-13 2012-10-14
## steps         12811       9900      10304      17382      12426      15098
## interval     339120     339120     339120     339120     339120     339120
##          2012-10-15 2012-10-16 2012-10-17 2012-10-18 2012-10-19 2012-10-20
## steps         10139      15084      13452      10056      11829      10395
## interval     339120     339120     339120     339120     339120     339120
##          2012-10-21 2012-10-22 2012-10-23 2012-10-24 2012-10-25 2012-10-26
## steps          8821      13460       8918       8355       2492       6778
## interval     339120     339120     339120     339120     339120     339120
##          2012-10-27 2012-10-28 2012-10-29 2012-10-30 2012-10-31 2012-11-02
## steps         10119      11458       5018       9819      15414      10600
## interval     339120     339120     339120     339120     339120     339120
##          2012-11-03 2012-11-05 2012-11-06 2012-11-07 2012-11-08 2012-11-11
## steps         10571      10439       8334      12883       3219      12608
## interval     339120     339120     339120     339120     339120     339120
##          2012-11-12 2012-11-13 2012-11-15 2012-11-16 2012-11-17 2012-11-18
## steps         10765       7336         41       5441      14339      15110
## interval     339120     339120     339120     339120     339120     339120
##          2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23 2012-11-24
## steps          8841       4472      12787      20427      21194      14478
## interval     339120     339120     339120     339120     339120     339120
##          2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## steps         11834      11162      13646      10183       7047
## interval     339120     339120     339120     339120     339120
        ##you can also use aggregate
data1 <- aggregate(data0$steps, by = list(Steps.Date = data0$date), FUN ="sum")
data1 <- as.data.frame(data1)
colnames(data1) <- c("Date", "Sum_Steps")
head(data1)
hist(data1$Sum_Steps, col= "orange", breaks=36, xlab="No. of Steps", main= "Sum of Steps per Day")

mean_step <- mean(data1$Sum_Steps)
mean_step
## [1] 10766.19
median_step <- median(data1$Sum_Steps)
median_step
## [1] 10765

Plotting Average Steps

data2 <- aggregate(data0$steps, by = list(Interval = data0$interval), FUN ="mean")
colnames(data2) <- c("Interval", "Average_Steps")
head(data2)
with(data2, plot(Interval, Average_Steps, col = "purple", 
                 main= "Average Daily Activity", 
                 xlab= "5min Intervals", 
                 ylab="Average no. Steps", 
                 type= "l"))

max <- which.max(data2$Average_Steps)
max2 <- data2[max,1]
max2
## [1] 835

Working with NA’s

dataNA <- read.csv("./data/activity.csv")
length(which(is.na(dataNA))) ##number of NA's
## [1] 2304
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
  dataFill <- dataNA
    dataFill$steps <- impute(dataNA$steps, fun=mean)


data3 <- aggregate(dataFill$steps, by = list(Date = dataFill$date), FUN ="sum")
data3 <- as.data.frame(data3)
    colnames(data3) <- c("Date", "Sum_Steps")
    head(data3)
hist(data3$Sum_Steps, col= "orange", breaks=36, xlab="No. of Steps", main= "Sum of Steps per Day")

mean_step <- mean(data3$Sum_Steps)
mean_step
## [1] 10766.19
median_step <- median(data3$Sum_Steps)
median_step
## [1] 10766.19

Weekday and Weekend

dataFill$date <- as.Date(dataFill$date)
dataFill$weekday <- weekdays(dataFill$date)
dataFill$weekend <- ifelse(dataFill$weekday=="Saturday" | dataFill$weekday=="Sunday", "Weekend", "Weekday" )
head(dataFill)
library(ggplot2)
data4 <- aggregate(dataFill$steps , by= list(dataFill$weekend, dataFill$interval), FUN = "mean") ##aggregating by interval, and weekend code.
data4 <- as.data.frame(data4)
    colnames(data4) <- c("Weekend", "Interval", "Mean_Steps")

ggplot(data4, aes(x=Interval, y=Mean_Steps, color=Weekend)) + 
  geom_line()+
  facet_grid(Weekend ~.) +
  xlab("Interval") + 
  ylab("Mean Steps") +
  ggtitle(" Average Number of Steps in Each Interval by Weekday/Weekend")

NOAA Storms example

storm <- read.csv("./data/StormData.csv") ##reading in the data
str(storm)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
summary(storm$FATALITIES) ##identify the max/min number of fatalities 583 is max
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0000   0.0168   0.0000 583.0000
summary(storm$INJURIES) ##1700 max injuries
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0000    0.0000    0.0000    0.1557    0.0000 1700.0000
summary(storm$PROPDMG)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   12.06    0.50 5000.00
summary(storm$CROPDMG)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.527   0.000 990.000
##Taking subset
storm1 <- subset(storm, FATALITIES > 100 | INJURIES > 500)

storm2 <- storm1[, c("EVTYPE", "INJURIES", "FATALITIES") ]

storm3 <- subset(storm, PROPDMG > 4500 | CROPDMG > 900)

storm4 <- storm3[, c("EVTYPE", "PROPDMG", "CROPDMG")]
storm2
library(ggplot2)
plot1 <- ggplot(storm1, aes(EVTYPE, INJURIES, fill=EVTYPE)) 
inj <- plot1 + geom_bar(stat = "identity") + 
   theme_bw() +
    theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
    xlab("Event Type")+
    ylab("Total Injuries")+
    ggtitle("Subset of Major Injuries by Event")
        inj  ##shows a bar graph, where tornado has the highest rate of injuries.

library(ggplot2)
plot2 <- ggplot(storm1, aes(EVTYPE, FATALITIES, fill=EVTYPE)) 
fat <- plot2 + geom_bar(stat = "identity") + 
   theme_bw() +
    theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
    xlab("Event Type")+
    ylab("Total Fatalities")+
    ggtitle("Subset of Fatalities by Event")
        fat ##Tornado also has the highest rates of fatalities..

library(ggplot2)
plot3 <- ggplot(storm3, aes(EVTYPE, CROPDMG, fill=EVTYPE)) 
cdmg <- plot3 + geom_bar(stat = "identity") + 
   theme_bw() +
    theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
    xlab("Event Type")+
    ylab("Damage")+
    guides(fill = "none")+
    ggtitle("Impact on Crops")
         ##shows a bar graph, where tornado has the highest rate of injuries.

library(ggplot2)
plot4 <- ggplot(storm3, aes(EVTYPE, PROPDMG, fill=EVTYPE)) 
pdmg <- plot4 + geom_bar(stat = "identity") + 
   theme_bw() +
    theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
    xlab("Event Type")+
    ylab("Damage")+
    guides(fill = "none")+
    ggtitle("Impact on Property")

library(cowplot)
title <- ggdraw() + draw_label("Economic Impacts of Major Storm Events", fontface='bold')
plotmain <- plot_grid(cdmg, pdmg, ncol=2, labels="AUTO")
finalplot <- plot_grid(title, plotmain, nrow=2, rel_heights = c(.2, 1, 1))

finalplot